S3: pass fileprefix into getBucket calls
author     Joey Hess <joeyh@joeyh.name>
           Mon, 10 Oct 2022 21:37:26 +0000 (17:37 -0400)
committer  Joey Hess <joeyh@joeyh.name>
           Mon, 10 Oct 2022 21:37:26 +0000 (17:37 -0400)
S3: Speed up importing from a large bucket when fileprefix= is set by only
asking for files under the prefix.

getBucket still returns the files with the prefix included, so the rest of
the fileprefix stripping still works unchanged.
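A minimal sketch of the behavior this relies on (not git-annex code; the
helper names listUnderPrefix and stripFilePrefix are hypothetical, but the
aws package record names are the ones used in the patch): setting
S3.gbPrefix narrows the listing, while each returned key still carries the
prefix, so stripping it afterwards is unchanged.

    import Data.Maybe (fromMaybe)
    import qualified Data.Text as T
    import qualified Aws.S3 as S3

    -- Hypothetical helper: ask S3 to list only keys under the prefix.
    listUnderPrefix :: S3.Bucket -> T.Text -> S3.GetBucket
    listUnderPrefix b prefix = (S3.getBucket b) { S3.gbPrefix = Just prefix }

    -- Keys in the response still include the prefix, so stripping it
    -- afterwards works exactly as before:
    stripFilePrefix :: T.Text -> T.Text -> T.Text
    stripFilePrefix prefix key = fromMaybe key (T.stripPrefix prefix key)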

Sponsored-by: Dartmouth College's DANDI project
CHANGELOG
Remote/S3.hs
doc/todo/allow_for_annonymous_AWS_S3_access/comment_8_b0d9dbe81f01e80809381a9e5f6a883d._comment [new file with mode: 0644]

index 8e8a0afb2b83711868222dd17d8aa153944ec52d..c393dad10169246e3d8fe444836ceed2aec9ba71 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -6,6 +6,8 @@ git-annex (10.20221004) UNRELEASED; urgency=medium
     do not operate on a repository that has an empty name.
   * move: Fix openFile crash with -J
     (Fixes a reversion in 8.20201103)
+  * S3: Speed up importing from a large bucket when fileprefix= is set
+    by only asking for files under the prefix.
 
  -- Joey Hess <id@joeyh.name>  Mon, 03 Oct 2022 13:36:42 -0400
 
index 1f0ebd3d5a2f71172688d961ab73fed247ddd518..46a9bc49ce633c04acad3c203472913876cab23b 100644
--- a/Remote/S3.hs
+++ b/Remote/S3.hs
@@ -216,7 +216,7 @@ gen r u rc gc rs = do
                                , renameExport = renameExportS3 hdl this rs info
                                }
                        , importActions = ImportActions
-                                { listImportableContents = listImportableContentsS3 hdl this info
+                                { listImportableContents = listImportableContentsS3 hdl this info c
                                , importKey = Nothing
                                 , retrieveExportWithContentIdentifier = retrieveExportWithContentIdentifierS3 hdl this rs info
                                 , storeExportWithContentIdentifier = storeExportWithContentIdentifierS3 hdl this rs info magic
@@ -548,8 +548,8 @@ renameExportS3 hv r rs info k src dest = Just <$> go
        srcobject = T.pack $ bucketExportLocation info src
        dstobject = T.pack $ bucketExportLocation info dest
 
-listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
-listImportableContentsS3 hv r info =
+listImportableContentsS3 :: S3HandleVar -> Remote -> S3Info -> ParsedRemoteConfig -> Annex (Maybe (ImportableContentsChunkable Annex (ContentIdentifier, ByteSize)))
+listImportableContentsS3 hv r info c =
        withS3Handle hv $ \case
                Nothing -> giveup $ needS3Creds (uuid r)
                Just h -> Just <$> go h
@@ -558,6 +558,8 @@ listImportableContentsS3 hv r info =
                ic <- liftIO $ runResourceT $ extractFromResourceT =<< startlist h
                return (ImportableContentsComplete ic)
 
+       fileprefix = T.pack <$> getRemoteConfigValue fileprefixField c
+
        startlist h
                | versioning info = do
                        rsp <- sendS3Handle h $ 
@@ -565,7 +567,8 @@ listImportableContentsS3 hv r info =
                        continuelistversioned h [] rsp
                | otherwise = do
                        rsp <- sendS3Handle h $ 
-                               S3.getBucket (bucket info)
+                               (S3.getBucket (bucket info))
+                                       { S3.gbPrefix = fileprefix }
                        continuelistunversioned h [] rsp
 
        continuelistunversioned h l rsp
@@ -573,6 +576,7 @@ listImportableContentsS3 hv r info =
                        rsp' <- sendS3Handle h $
                                (S3.getBucket (bucket info))
                                        { S3.gbMarker = S3.gbrNextMarker rsp
+                                       , S3.gbPrefix = fileprefix
                                        }
                        continuelistunversioned h (rsp:l) rsp'
                | otherwise = return $
@@ -584,6 +588,7 @@ listImportableContentsS3 hv r info =
                                (S3.getBucketObjectVersions (bucket info))
                                        { S3.gbovKeyMarker = S3.gbovrNextKeyMarker rsp
                                        , S3.gbovVersionIdMarker = S3.gbovrNextVersionIdMarker rsp
+                                       , S3.gbovPrefix = fileprefix
                                        }
                        continuelistversioned h (rsp:l) rsp'
                | otherwise = return $
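S3 does not carry the prefix across pages, so each continuation request in
the hunks above sets S3.gbPrefix (or S3.gbovPrefix) again alongside the
marker. A standalone, hedged sketch of that pagination loop, assuming the
aws package's pureAws runner in place of git-annex's sendS3Handle
(listAllUnderPrefix is a hypothetical name):

    import qualified Aws
    import qualified Aws.S3 as S3
    import qualified Data.Text as T
    import Control.Monad.Trans.Resource (runResourceT)
    import Network.HTTP.Client (Manager)

    -- Paginate through a bucket listing, repeating the prefix on
    -- every page; gbrNextMarker drives the continuation.
    listAllUnderPrefix
        :: Aws.Configuration
        -> S3.S3Configuration Aws.NormalQuery
        -> Manager
        -> S3.Bucket
        -> Maybe T.Text
        -> IO [S3.ObjectInfo]
    listAllUnderPrefix cfg s3cfg mgr b prefix = go Nothing []
      where
        go marker acc = do
            rsp <- runResourceT $ Aws.pureAws cfg s3cfg mgr $
                (S3.getBucket b)
                    { S3.gbPrefix = prefix
                    , S3.gbMarker = marker -- Nothing on the first page
                    }
            let acc' = acc ++ S3.gbrContents rsp
            if S3.gbrIsTruncated rsp
                then go (S3.gbrNextMarker rsp) acc'
                else return acc'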
diff --git a/doc/todo/allow_for_annonymous_AWS_S3_access/comment_8_b0d9dbe81f01e80809381a9e5f6a883d._comment b/doc/todo/allow_for_annonymous_AWS_S3_access/comment_8_b0d9dbe81f01e80809381a9e5f6a883d._comment
new file mode 100644
index 0000000..7aa88f7
--- /dev/null
@@ -0,0 +1,27 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 8"""
+ date="2022-10-10T21:04:49Z"
+ content="""
+I've finished the work on aws, which is in
+<https://github.com/aristidb/aws/pull/281> and which I hope will be merged soon.
+
+git-annex now has a branch `anons3` that implements this, when
+the S3 remote is configured with signature=anonymous.
+
+       $ git-annex initremote s3-origin type=S3 importtree=yes encryption=none bucket=dandiarchive fileprefix=zarr-checksums/2ac71edb-738c-40ac-bd8c-8ca985adaa12/  signature=anonymous
+       initremote s3-origin (checking bucket...) ok
+       (recording state in git...)
+       $ git-annex import master --from s3-origin
+       list s3-origin ok
+       import s3-origin .checksum 
+       ok                                
+       import s3-origin 0/.checksum 
+       ok                                
+       import s3-origin 0/0/.checksum 
+       ok
+       ^C
+
+Also, I've fixed it to only list files in the fileprefix, which
+sped up the listing a *lot* in this bucket with many other files.
+"""]]